{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "54f057f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install sacrebleu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a91d6c3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "027c918e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sacrebleu.metrics import BLEU, CHRF, TER\n",
    "\n",
    "bleu = BLEU()\n",
    "chrf = CHRF(word_order = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a92c862b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Config, TFT5Model, T5Model, load_tf_weights_in_t5, T5Tokenizer\n",
    "from transformers import T5ForConditionalGeneration, TFT5ForConditionalGeneration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "db0bd062",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4bffc827dbfd4c0c9f34b078daa8e63e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading config.json:   0%|          | 0.00/769 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "56fca6972aa2495ca8ac3230539694cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tf_model.h5:   0%|          | 0.00/231M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-08-01 01:38:36.798298: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
      "2022-08-01 01:38:36.798336: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n",
      "2022-08-01 01:38:36.798340: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n",
      "2022-08-01 01:38:36.798533: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.129.6\n",
      "2022-08-01 01:38:36.798549: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6\n",
      "2022-08-01 01:38:36.798552: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.129.6\n",
      "2022-08-01 01:38:36.798883: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-08-01 01:38:36.807341: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
      "All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.\n",
      "\n",
      "All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at mesolitica/t5-small-finetuned-noisy-en-ms.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "model_tf = TFT5ForConditionalGeneration.from_pretrained('mesolitica/t5-small-finetuned-noisy-en-ms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a7c8e321",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ec0ef119234f4935b4a3c95333d13de2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading spiece.model:   0%|          | 0.00/784k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c15db78155ce4a33a6f4db7e616f4ef7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading special_tokens_map.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "345c15e17d6f43ef99dc53a03a8f0ae7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tokenizer_config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-finetuned-noisy-en-ms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "78995f5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "\n",
    "with open('eng_Latn.dev') as fopen:\n",
    "    eng = fopen.read().split('\\n')\n",
    "    \n",
    "with open('zsm_Latn.dev') as fopen:\n",
    "    ms = fopen.read().split('\\n')\n",
    "    \n",
    "right = [unidecode(s) for s in ms]\n",
    "left = [unidecode(s) for s in eng]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8547c53f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(998, 998)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ms), len(eng)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3b2008a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                                                                         | 0/63 [00:00<?, ?it/s]2022-08-01 01:38:44.700456: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0xe924dc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
      "2022-08-01 01:38:44.700478: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): Host, Default Version\n",
      "2022-08-01 01:38:44.711646: I tensorflow/compiler/jit/xla_compilation_cache.cc:363] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [02:59<00:00,  2.85s/it]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "batch_size = 16\n",
    "\n",
    "results = []\n",
    "for i in tqdm(range(0, len(left), batch_size)):\n",
    "    input_ids = [{'input_ids': tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'tf')[0]} for s in left[i:i + batch_size]]\n",
    "    padded = tokenizer.pad(input_ids, padding = 'longest')\n",
    "    outputs = model_tf.generate(**padded, max_length = 1000)\n",
    "    for o in outputs:\n",
    "        results.append(tokenizer.decode(o, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "88dc50e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "998"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "30dc51d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_left, filtered_right = [], []\n",
    "for no, r in enumerate(results):\n",
    "    if len(r):\n",
    "        filtered_left.append(r)\n",
    "        filtered_right.append(right[no])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d266a157",
   "metadata": {},
   "outputs": [],
   "source": [
    "refs = [filtered_right]\n",
    "sys = filtered_left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "266f0a14",
   "metadata": {},
   "outputs": [],
   "source": [
    "r = bleu.corpus_score(sys, refs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "63b0175b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'name': 'BLEU',\n",
       " 'score': 42.11032800535816,\n",
       " '_mean': -1.0,\n",
       " '_ci': -1.0,\n",
       " '_verbose': '73.2/50.3/35.8/25.8 (BP = 0.980 ratio = 0.980 hyp_len = 21597 ref_len = 22027)',\n",
       " 'bp': 0.9802867258527527,\n",
       " 'counts': [15815, 10355, 7025, 4803],\n",
       " 'totals': [21597, 20600, 19603, 18606],\n",
       " 'sys_len': 21597,\n",
       " 'ref_len': 22027,\n",
       " 'precisions': [73.22776311524748,\n",
       "  50.26699029126213,\n",
       "  35.836351578839974,\n",
       "  25.81425346662367],\n",
       " 'prec_str': '73.2/50.3/35.8/25.8',\n",
       " 'ratio': 0.9804785036546058}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4d85f832",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "chrF2++ = 66.31"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chrf.corpus_score(sys, refs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fe377ce",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
